First we have to fetch the data. The source used here is https://covidtracking.com. The provide an API for fetching data by state. In the following data chunk, we end up with a list of lists. The number of measurements is shown along with the structure of a measurement.
library(httr)
req <- GET("https://covidtracking.com/api/v1/states/daily.json")
data <- content(req, as = "parsed")
length(data)
## [1] 11186
str(data[[1]])
## List of 54
## $ date : int 20200919
## $ state : chr "AK"
## $ positive : int 7674
## $ negative : int 415176
## $ pending : NULL
## $ totalTestResults : int 422850
## $ hospitalizedCurrently : int 41
## $ hospitalizedCumulative : NULL
## $ inIcuCurrently : NULL
## $ inIcuCumulative : NULL
## $ onVentilatorCurrently : int 13
## $ onVentilatorCumulative : NULL
## $ recovered : int 2438
## $ dataQualityGrade : chr "A"
## $ lastUpdateEt : chr "9/19/2020 03:59"
## $ dateModified : chr "2020-09-19T03:59:00Z"
## $ checkTimeEt : chr "09/18 23:59"
## $ death : int 45
## $ hospitalized : NULL
## $ dateChecked : chr "2020-09-19T03:59:00Z"
## $ totalTestsViral : int 422850
## $ positiveTestsViral : int 6970
## $ negativeTestsViral : int 415600
## $ positiveCasesViral : int 7674
## $ deathConfirmed : int 45
## $ deathProbable : NULL
## $ totalTestEncountersViral : NULL
## $ totalTestsPeopleViral : NULL
## $ totalTestsAntibody : NULL
## $ positiveTestsAntibody : NULL
## $ negativeTestsAntibody : NULL
## $ totalTestsPeopleAntibody : NULL
## $ positiveTestsPeopleAntibody: NULL
## $ negativeTestsPeopleAntibody: NULL
## $ totalTestsPeopleAntigen : NULL
## $ positiveTestsPeopleAntigen : NULL
## $ totalTestsAntigen : NULL
## $ positiveTestsAntigen : NULL
## $ fips : chr "02"
## $ positiveIncrease : int 87
## $ negativeIncrease : int 4470
## $ total : int 422850
## $ totalTestResultsSource : chr "posNeg"
## $ totalTestResultsIncrease : int 4557
## $ posNeg : int 422850
## $ deathIncrease : int 0
## $ hospitalizedIncrease : int 0
## $ hash : chr "934611e27c6c701c808cf53976fe9a48c80fd513"
## $ commercialScore : int 0
## $ negativeRegularScore : int 0
## $ negativeScore : int 0
## $ positiveScore : int 0
## $ score : int 0
## $ grade : chr ""
library(httr)
library(data.table)
info_req <- GET("https://covidtracking.com/api/v1/states/info.json")
info <- content(info_req, as = "parsed")
length(info)
## [1] 56
str(info[[1]])
## List of 14
## $ state : chr "AK"
## $ notes : chr "Alaska combines PCR and antigen tests in the total tests figure reported on the state's dashboard.\n\nAlaska re"| __truncated__
## $ covid19Site : chr "http://dhss.alaska.gov/dph/Epi/id/Pages/COVID-19/monitoring.aspx"
## $ covid19SiteSecondary : chr "https://experience.arcgis.com/experience/ed1c874ca60b4c15ab09095a070065ca"
## $ covid19SiteTertiary : chr "https://alaska-dhss.maps.arcgis.com/apps/opsdashboard/index.html#/8782a14ef52342e99f866a3b8a3e624a"
## $ twitter : chr "@Alaska_DHSS"
## $ covid19SiteOld : chr "http://dhss.alaska.gov/dph/Epi/id/Pages/COVID-19/default.aspx"
## $ covidTrackingProjectPreferredTotalTestUnits: chr "Specimens"
## $ covidTrackingProjectPreferredTotalTestField: chr "totalTestsViral"
## $ totalTestResultsField : chr "Total Tests (PCR)"
## $ name : chr "Alaska"
## $ fips : chr "02"
## $ pui : chr ""
## $ pum : logi FALSE
state_info <- rbindlist(as.list(info), fill=TRUE)
get_state_name <- function(abbr) {
state_info[state_info$state == abbr]$name
}
library(data.table)
library(ggplot2)
dt <- rbindlist(data, fill=TRUE)
## Warning in rbindlist(data, fill = TRUE): Column 5 ['pending'] of item 1 is
## length 0. This (and 225969 others like it) has been filled with NA (NULL for
## list columns) to make each item uniform.
dt$date <- as.Date(as.character(dt$date), format = "%Y%m%d")
dt$dateChecked <- as.POSIXct(dt$dateChecked, "%Y-%m-%dT%H:%M:%S", tz="UTC")
dt$state <- as.factor(dt$state)
dt$fips <- as.factor(dt$fips)
dt <- dt[, -"hash"]
str(dt[1,])
## Classes 'data.table' and 'data.frame': 1 obs. of 53 variables:
## $ date : Date, format: "2020-09-19"
## $ state : Factor w/ 56 levels "AK","AL","AR",..: 1
## $ positive : int 7674
## $ negative : int 415176
## $ pending : int NA
## $ totalTestResults : int 422850
## $ hospitalizedCurrently : int 41
## $ hospitalizedCumulative : int NA
## $ inIcuCurrently : int NA
## $ inIcuCumulative : int NA
## $ onVentilatorCurrently : int 13
## $ onVentilatorCumulative : int NA
## $ recovered : int 2438
## $ dataQualityGrade : chr "A"
## $ lastUpdateEt : chr "9/19/2020 03:59"
## $ dateModified : chr "2020-09-19T03:59:00Z"
## $ checkTimeEt : chr "09/18 23:59"
## $ death : int 45
## $ hospitalized : int NA
## $ dateChecked : POSIXct, format: "2020-09-19 03:59:00"
## $ totalTestsViral : int 422850
## $ positiveTestsViral : int 6970
## $ negativeTestsViral : int 415600
## $ positiveCasesViral : int 7674
## $ deathConfirmed : int 45
## $ deathProbable : int NA
## $ totalTestEncountersViral : num NA
## $ totalTestsPeopleViral : int NA
## $ totalTestsAntibody : int NA
## $ positiveTestsAntibody : int NA
## $ negativeTestsAntibody : int NA
## $ totalTestsPeopleAntibody : int NA
## $ positiveTestsPeopleAntibody: int NA
## $ negativeTestsPeopleAntibody: int NA
## $ totalTestsPeopleAntigen : int NA
## $ positiveTestsPeopleAntigen : int NA
## $ totalTestsAntigen : int NA
## $ positiveTestsAntigen : int NA
## $ fips : Factor w/ 56 levels "01","02","04",..: 2
## $ positiveIncrease : int 87
## $ negativeIncrease : int 4470
## $ total : int 422850
## $ totalTestResultsSource : chr "posNeg"
## $ totalTestResultsIncrease : int 4557
## $ posNeg : int 422850
## $ deathIncrease : int 0
## $ hospitalizedIncrease : int 0
## $ commercialScore : int 0
## $ negativeRegularScore : int 0
## $ negativeScore : int 0
## $ positiveScore : int 0
## $ score : int 0
## $ grade : chr ""
## - attr(*, ".internal.selfref")=<externalptr>
picked <- dt[, c("state", "positive", "negative", "recovered","death", "total")]
pairs(picked)
normalized <- cbind.data.frame(dt$positive/dt$totalTestResults, dt$negative/dt$totalTestResults, dt$recovered/dt$total, dt$death/dt$total)
colnames(normalized) <- c("pos", "neg", "recovered", "dead")
pairs(normalized)
library(ggplot2)
library(cowplot)
plot_list <- list()
for(s in levels(dt$state)) {
by_state <- dt[dt$state == s,]
date_order <- order(by_state$date)
by_state_ordered_by_date <- by_state[date_order,]
# to find the earliest meausrement, restrict by complete cases
# the date of element 0.
earliest_death_report <- na.omit(by_state_ordered_by_date[,c("date","death")])[1,]
earliest_death <- by_state_ordered_by_date[by_state_ordered_by_date$death > 0,c("date","death")][1,]
print(data.frame(state = s, earliest_report = earliest_death_report$date, earliest_death = earliest_death$date, earliest_number =earliest_death$death ))
setnafill(by_state_ordered_by_date,type="locf", cols=c("death"))
setnafill(by_state_ordered_by_date,type="const", fill=0, cols=c("death"))
integral_breaks <- function(values) {
breaks <- unique(floor(pretty(seq(0, 1.1 * max(values) + 1))))
return(breaks)
}
y_breaks <- integral_breaks(
by_state_ordered_by_date$death
)
p <- ggplot(by_state_ordered_by_date, aes(x=date, y=death)) +
geom_point(shape=1) +
scale_y_continuous(breaks = y_breaks)
# ggtitle(get_state_name(s))
plot_list[[s]] <- p
}
## state earliest_report earliest_death earliest_number
## 1 AK 2020-03-06 2020-03-25 1
## state earliest_report earliest_death earliest_number
## 1 AL 2020-03-15 2020-03-26 1
## state earliest_report earliest_death earliest_number
## 1 AR 2020-03-22 2020-03-25 2
## state earliest_report earliest_death earliest_number
## 1 AS 2020-03-18 <NA> NA
## state earliest_report earliest_death earliest_number
## 1 AZ 2020-03-13 2020-03-21 1
## state earliest_report earliest_death earliest_number
## 1 CA 2020-03-12 2020-03-12 4
## state earliest_report earliest_death earliest_number
## 1 CO 2020-03-14 2020-03-14 1
## state earliest_report earliest_death earliest_number
## 1 CT 2020-03-19 2020-03-19 1
## state earliest_report earliest_death earliest_number
## 1 DC 2020-03-19 2020-03-20 1
## state earliest_report earliest_death earliest_number
## 1 DE 2020-03-08 2020-03-24 1
## state earliest_report earliest_death earliest_number
## 1 FL 2020-03-11 2020-03-11 2
## state earliest_report earliest_death earliest_number
## 1 GA 2020-03-13 2020-03-13 1
## state earliest_report earliest_death earliest_number
## 1 GU 2020-03-22 2020-03-22 1
## state earliest_report earliest_death earliest_number
## 1 HI 2020-04-01 2020-04-01 1
## state earliest_report earliest_death earliest_number
## 1 IA 2020-03-25 2020-03-25 1
## state earliest_report earliest_death earliest_number
## 1 ID 2020-03-22 2020-03-27 3
## state earliest_report earliest_death earliest_number
## 1 IL 2020-03-17 2020-03-17 1
## state earliest_report earliest_death earliest_number
## 1 IN 2020-03-14 2020-03-16 1
## state earliest_report earliest_death earliest_number
## 1 KS 2020-03-14 2020-03-14 1
## state earliest_report earliest_death earliest_number
## 1 KY 2020-03-14 2020-03-14 1
## state earliest_report earliest_death earliest_number
## 1 LA 2020-03-15 2020-03-15 2
## state earliest_report earliest_death earliest_number
## 1 MA 2020-03-18 2020-03-18 2
## state earliest_report earliest_death earliest_number
## 1 MD 2020-03-16 2020-03-16 2
## state earliest_report earliest_death earliest_number
## 1 ME 2020-03-27 2020-03-27 1
## state earliest_report earliest_death earliest_number
## 1 MI 2020-03-17 2020-03-17 2
## state earliest_report earliest_death earliest_number
## 1 MN 2020-03-21 2020-03-21 1
## state earliest_report earliest_death earliest_number
## 1 MO 2020-03-14 2020-03-19 1
## state earliest_report earliest_death earliest_number
## 1 MP 2020-03-18 2020-04-02 1
## state earliest_report earliest_death earliest_number
## 1 MS 2020-03-20 2020-03-20 1
## state earliest_report earliest_death earliest_number
## 1 MT 2020-03-28 2020-03-28 1
## state earliest_report earliest_death earliest_number
## 1 NC 2020-03-16 2020-03-25 1
## state earliest_report earliest_death earliest_number
## 1 ND 2020-03-13 2020-03-28 1
## state earliest_report earliest_death earliest_number
## 1 NE 2020-03-19 2020-03-28 2
## state earliest_report earliest_death earliest_number
## 1 NH 2020-03-24 2020-03-24 1
## state earliest_report earliest_death earliest_number
## 1 NJ 2020-02-10 2020-03-11 1
## state earliest_report earliest_death earliest_number
## 1 NM 2020-03-25 2020-03-25 1
## state earliest_report earliest_death earliest_number
## 1 NV 2020-03-05 2020-03-15 1
## state earliest_report earliest_death earliest_number
## 1 NY 2020-03-15 2020-03-15 3
## state earliest_report earliest_death earliest_number
## 1 OH 2020-03-20 2020-03-20 1
## state earliest_report earliest_death earliest_number
## 1 OK 2020-03-19 2020-03-19 1
## state earliest_report earliest_death earliest_number
## 1 OR 2020-03-18 2020-03-18 3
## state earliest_report earliest_death earliest_number
## 1 PA 2020-03-18 2020-03-18 1
## state earliest_report earliest_death earliest_number
## 1 PR 2020-03-22 2020-03-22 1
## state earliest_report earliest_death earliest_number
## 1 RI 2020-03-29 2020-03-29 3
## state earliest_report earliest_death earliest_number
## 1 SC 2020-03-16 2020-03-16 1
## state earliest_report earliest_death earliest_number
## 1 SD 2020-03-18 2020-03-18 1
## state earliest_report earliest_death earliest_number
## 1 TN 2020-03-23 2020-03-23 2
## state earliest_report earliest_death earliest_number
## 1 TX 2020-03-17 2020-03-17 1
## state earliest_report earliest_death earliest_number
## 1 UT 2020-03-15 2020-03-22 1
## state earliest_report earliest_death earliest_number
## 1 VA 2020-03-15 2020-03-15 1
## state earliest_report earliest_death earliest_number
## 1 VI 2020-04-05 2020-04-05 1
## state earliest_report earliest_death earliest_number
## 1 VT 2020-03-20 2020-03-20 2
## state earliest_report earliest_death earliest_number
## 1 WA 2020-02-26 2020-02-26 2
## state earliest_report earliest_death earliest_number
## 1 WI 2020-03-20 2020-03-20 3
## state earliest_report earliest_death earliest_number
## 1 WV 2020-03-15 2020-03-30 1
## state earliest_report earliest_death earliest_number
## 1 WY 2020-03-27 2020-04-13 1
step = 4
for (i in seq(from = 1, to = length(plot_list), by = step)) {
group <- plot_list[i:(i+step-1)]
#print(str(group))
print(plot_grid(plotlist = group, ncol = 2,labels = names(group)))
}
Now let’s look at overall population stats
pop <- GET("https://api.census.gov/data/2019/pep/population?get=DATE_CODE,POP,NAME&for=STATE:*")
popdata <- content(pop, as = "parsed")
dt2 <- rbindlist(popdata[2:length(popdata)])
colnames(dt2) <- unlist(popdata[[1]])
dt2 <- dt2[dt2$DATE_CODE == 12,-c("DATE_CODE")]
dt2$POP <- as.numeric(dt2$POP)
dt2$state <- as.factor(dt2$state)
names(dt2)[names(dt2) == "state"] <- "fips"
dt2[order(dt2$NAME),]
ggplot(dt2, aes(x=NAME,y=POP), fill=NAME) + geom_bar(stat="identity")
dt3 <- merge(dt,dt2, by="fips")
Now we can calculate deaths per million of population.
dt3[["dpm"]] <- dt3$death / dt3$POP * 1e6
dt3
Now lets, foe each fips code, find the earliest day where dpm is >= 1 and then what is the dpm 21 days after that point.
state_df <- data.frame(
state=character(0),
location_name=character(0),
date1m=character(0),
dpm21d=numeric(0)
)
str(state_df)
## 'data.frame': 0 obs. of 4 variables:
## $ state : chr
## $ location_name: chr
## $ date1m : chr
## $ dpm21d : num
for (code in levels(dt3$fips)) {
if (length(dt3[dt3$fips == code,]$date) == 0) {
break
}
print(code)
#print(dt3[dt3$fips == code,]$date)
by_state <- dt3[dt3$fips == code,]
date_order <- order(by_state$date)
by_state_ordered_by_date <- by_state[date_order,]
a <- setnafill(by_state_ordered_by_date,type="locf", cols=c("dpm"))
b <- setnafill(by_state_ordered_by_date,type="const", fill=0, cols=c("dpm"))
k <- Position(function(x) x > 1, by_state_ordered_by_date$dpm)
k21 <- k + 21
print(by_state_ordered_by_date[k,]$date)
obs <- list(
state=by_state_ordered_by_date[k,]$state,
location_name=by_state_ordered_by_date[k,]$NAME,
date1m=as.character(by_state_ordered_by_date[k,]$date),
dpm21d= by_state_ordered_by_date[k21,]$dpm
)
state_df <- rbind(state_df,obs)
}
## [1] "01"
## [1] "2020-03-30"
## [1] "02"
## [1] "2020-03-25"
## [1] "04"
## [1] "2020-03-26"
## [1] "05"
## [1] "2020-03-28"
## [1] "06"
## [1] "2020-03-24"
## [1] "08"
## [1] "2020-03-23"
## [1] "09"
## [1] "2020-03-21"
## [1] "10"
## [1] "2020-03-24"
## [1] "11"
## [1] "2020-03-20"
## [1] "12"
## [1] "2020-03-25"
## [1] "13"
## [1] "2020-03-20"
## [1] "15"
## [1] "2020-04-03"
## [1] "16"
## [1] "2020-03-27"
## [1] "17"
## [1] "2020-03-24"
## [1] "18"
## [1] "2020-03-23"
## [1] "19"
## [1] "2020-03-29"
## [1] "20"
## [1] "2020-03-25"
## [1] "21"
## [1] "2020-03-27"
## [1] "22"
## [1] "2020-03-18"
## [1] "23"
## [1] "2020-03-29"
## [1] "24"
## [1] "2020-03-26"
## [1] "25"
## [1] "2020-03-21"
## [1] "26"
## [1] "2020-03-19"
## [1] "27"
## [1] "2020-03-29"
## [1] "28"
## [1] "2020-03-26"
## [1] "29"
## [1] "2020-03-25"
## [1] "30"
## [1] "2020-03-30"
## [1] "31"
## [1] "2020-03-28"
## [1] "32"
## [1] "2020-03-21"
## [1] "33"
## [1] "2020-03-28"
## [1] "34"
## [1] "2020-03-19"
## [1] "35"
## [1] "2020-03-31"
## [1] "36"
## [1] "2020-03-20"
## [1] "37"
## [1] "2020-04-02"
## [1] "38"
## [1] "2020-03-28"
## [1] "39"
## [1] "2020-03-26"
## [1] "40"
## [1] "2020-03-25"
## [1] "41"
## [1] "2020-03-23"
## [1] "42"
## [1] "2020-03-26"
## [1] "44"
## [1] "2020-03-29"
## [1] "45"
## [1] "2020-03-25"
## [1] "46"
## [1] "2020-03-18"
## [1] "47"
## [1] "2020-03-29"
## [1] "48"
## [1] "2020-03-29"
## [1] "49"
## [1] "2020-03-30"
## [1] "50"
## [1] "2020-03-20"
## [1] "51"
## [1] "2020-03-25"
## [1] "53"
## [1] "2020-03-01"
## [1] "54"
## [1] "2020-04-02"
## [1] "55"
## [1] "2020-03-25"
## [1] "56"
## [1] "2020-04-13"
state_df <- state_df[complete.cases(state_df),]
state_dt <- data.table(state_df)
state_dt
data_dir <- paste(getwd(), "data", sep='/')
temp <- tempfile()
imhe_source <- "https://ihmecovid19storage.blob.core.windows.net/latest/ihme-covid19.zip"
download.file(imhe_source, temp, mode="wb")
data_file <- paste(data_dir, "Summary_stats_all_locs.csv", sep='/')
unzip(temp, exdir=data_dir, junkpaths=TRUE)
dd <- read.table(data_file, sep=",", header=T)
unlink(temp)
#dd_ss <- dd[,c("location_name", "any_business_start_date","all_non.ess_business_start_date")]
dd_ss <- dd[,c("location_name", "any_business_start_date")]
dd_ss <- dd
state_dt2 <- merge(state_dt, dd_ss, by="location_name")
#head(by_state_ordered_by_date)
state_dt2$days_ld <- as.numeric(as.Date(state_dt2$any_business_start_date, '%Y-%m-%d') - as.Date(state_dt2$date1m))
state_dt2
ggplot(state_dt2, aes(x=days_ld, y=dpm21d)) + geom_point(shape=1)
## Warning: Removed 1 rows containing missing values (geom_point).